Latent Variables active in Random Forests

First we get the metaviper predictions, LV scores, and Random Forest weights from Synapse. We filter for LVs that are selected by the random forest.

#get immune predictions
dtab<-synapser::synTableQuery(paste('select * from',mp_scores))$asDataFrame()%>%
  subset(isCellLine!='TRUE')
## 
 [####################]100.00%   1/1   Done...    
Downloading  [##------------------]7.96%   2.0MB/25.1MB (3.5MB/s) Job-103108778937140675737338019.csv     
Downloading  [###-----------------]15.92%   4.0MB/25.1MB (5.1MB/s) Job-103108778937140675737338019.csv     
Downloading  [#####---------------]23.88%   6.0MB/25.1MB (6.1MB/s) Job-103108778937140675737338019.csv     
Downloading  [######--------------]31.85%   8.0MB/25.1MB (6.7MB/s) Job-103108778937140675737338019.csv     
Downloading  [########------------]39.81%   10.0MB/25.1MB (7.0MB/s) Job-103108778937140675737338019.csv     
Downloading  [##########----------]47.77%   12.0MB/25.1MB (7.4MB/s) Job-103108778937140675737338019.csv     
Downloading  [###########---------]55.73%   14.0MB/25.1MB (7.6MB/s) Job-103108778937140675737338019.csv     
Downloading  [#############-------]63.69%   16.0MB/25.1MB (7.8MB/s) Job-103108778937140675737338019.csv     
Downloading  [##############------]71.65%   18.0MB/25.1MB (8.0MB/s) Job-103108778937140675737338019.csv     
Downloading  [################----]79.61%   20.0MB/25.1MB (8.0MB/s) Job-103108778937140675737338019.csv     
Downloading  [##################--]87.58%   22.0MB/25.1MB (7.8MB/s) Job-103108778937140675737338019.csv     
Downloading  [###################-]95.54%   24.0MB/25.1MB (7.8MB/s) Job-103108778937140675737338019.csv     
Downloading  [####################]100.00%   25.1MB/25.1MB (7.7MB/s) Job-103108778937140675737338019.csv Done...
##get metaviper scores
mtab<-synapser::synTableQuery(paste('select * from',metaviper_scores))$asDataFrame()
## 
Building the CSV... [#-------------------]3.28%   48152/1467984       
Building the CSV... [#-------------------]7.43%   109077/1467984       
Building the CSV... [###-----------------]16.36%   240110/1467984       
Building the CSV... [#####---------------]24.91%   365629/1467984       
Building the CSV... [######--------------]28.99%   425640/1467984       
Building the CSV... [########------------]38.01%   558026/1467984       
Create CSV FileHandle [##########----------]50.00%   733995/1467984       
Create CSV FileHandle [####################]100.00%   1467984/1467984   Done...    
Downloading  [--------------------]2.47%   2.0MB/81.0MB (3.6MB/s) Job-103108782087876564897722437.csv     
Downloading  [#-------------------]4.94%   4.0MB/81.0MB (5.0MB/s) Job-103108782087876564897722437.csv     
Downloading  [#-------------------]7.40%   6.0MB/81.0MB (6.1MB/s) Job-103108782087876564897722437.csv     
Downloading  [##------------------]9.87%   8.0MB/81.0MB (6.8MB/s) Job-103108782087876564897722437.csv     
Downloading  [##------------------]12.34%   10.0MB/81.0MB (7.1MB/s) Job-103108782087876564897722437.csv     
Downloading  [###-----------------]14.81%   12.0MB/81.0MB (7.4MB/s) Job-103108782087876564897722437.csv     
Downloading  [###-----------------]17.27%   14.0MB/81.0MB (7.6MB/s) Job-103108782087876564897722437.csv     
Downloading  [####----------------]19.74%   16.0MB/81.0MB (7.8MB/s) Job-103108782087876564897722437.csv     
Downloading  [####----------------]22.21%   18.0MB/81.0MB (8.0MB/s) Job-103108782087876564897722437.csv     
Downloading  [#####---------------]24.68%   20.0MB/81.0MB (8.1MB/s) Job-103108782087876564897722437.csv     
Downloading  [#####---------------]27.14%   22.0MB/81.0MB (8.2MB/s) Job-103108782087876564897722437.csv     
Downloading  [######--------------]29.61%   24.0MB/81.0MB (8.4MB/s) Job-103108782087876564897722437.csv     
Downloading  [######--------------]32.08%   26.0MB/81.0MB (8.5MB/s) Job-103108782087876564897722437.csv     
Downloading  [#######-------------]34.55%   28.0MB/81.0MB (8.6MB/s) Job-103108782087876564897722437.csv     
Downloading  [#######-------------]37.01%   30.0MB/81.0MB (8.7MB/s) Job-103108782087876564897722437.csv     
Downloading  [########------------]39.48%   32.0MB/81.0MB (8.8MB/s) Job-103108782087876564897722437.csv     
Downloading  [########------------]41.95%   34.0MB/81.0MB (8.8MB/s) Job-103108782087876564897722437.csv     
Downloading  [#########-----------]44.42%   36.0MB/81.0MB (8.7MB/s) Job-103108782087876564897722437.csv     
Downloading  [#########-----------]46.88%   38.0MB/81.0MB (8.7MB/s) Job-103108782087876564897722437.csv     
Downloading  [##########----------]49.35%   40.0MB/81.0MB (8.7MB/s) Job-103108782087876564897722437.csv     
Downloading  [##########----------]51.82%   42.0MB/81.0MB (8.7MB/s) Job-103108782087876564897722437.csv     
Downloading  [###########---------]54.29%   44.0MB/81.0MB (8.8MB/s) Job-103108782087876564897722437.csv     
Downloading  [###########---------]56.76%   46.0MB/81.0MB (8.7MB/s) Job-103108782087876564897722437.csv     
Downloading  [############--------]59.22%   48.0MB/81.0MB (8.7MB/s) Job-103108782087876564897722437.csv     
Downloading  [############--------]61.69%   50.0MB/81.0MB (8.8MB/s) Job-103108782087876564897722437.csv     
Downloading  [#############-------]64.16%   52.0MB/81.0MB (8.8MB/s) Job-103108782087876564897722437.csv     
Downloading  [#############-------]66.63%   54.0MB/81.0MB (8.8MB/s) Job-103108782087876564897722437.csv     
Downloading  [##############------]69.09%   56.0MB/81.0MB (8.9MB/s) Job-103108782087876564897722437.csv     
Downloading  [##############------]71.56%   58.0MB/81.0MB (8.8MB/s) Job-103108782087876564897722437.csv     
Downloading  [###############-----]74.03%   60.0MB/81.0MB (8.9MB/s) Job-103108782087876564897722437.csv     
Downloading  [###############-----]76.50%   62.0MB/81.0MB (8.9MB/s) Job-103108782087876564897722437.csv     
Downloading  [################----]78.96%   64.0MB/81.0MB (9.0MB/s) Job-103108782087876564897722437.csv     
Downloading  [################----]81.43%   66.0MB/81.0MB (8.8MB/s) Job-103108782087876564897722437.csv     
Downloading  [#################---]83.90%   68.0MB/81.0MB (8.8MB/s) Job-103108782087876564897722437.csv     
Downloading  [#################---]86.37%   70.0MB/81.0MB (8.9MB/s) Job-103108782087876564897722437.csv     
Downloading  [##################--]88.83%   72.0MB/81.0MB (8.9MB/s) Job-103108782087876564897722437.csv     
Downloading  [##################--]91.30%   74.0MB/81.0MB (8.9MB/s) Job-103108782087876564897722437.csv     
Downloading  [###################-]93.77%   76.0MB/81.0MB (9.0MB/s) Job-103108782087876564897722437.csv     
Downloading  [###################-]96.24%   78.0MB/81.0MB (9.0MB/s) Job-103108782087876564897722437.csv     
Downloading  [####################]98.70%   80.0MB/81.0MB (9.0MB/s) Job-103108782087876564897722437.csv     
Downloading  [####################]100.00%   81.0MB/81.0MB (9.0MB/s) Job-103108782087876564897722437.csv Done...
##get rf loadings
rftab<-synapser::synTableQuery(paste('select * from',rf_mp))$asDataFrame()%>%
  select(LV_Full,`Cutaneous Neurofibroma`,`Neurofibroma`,`Malignant Peripheral Nerve Sheath Tumor`,`Plexiform Neurofibroma`)%>%
  mutate(latent_var=gsub('`','',LV_Full))%>%
  select(-LV_Full)
## 
 [####################]100.00%   1/1   Done...    
Downloading  [####################]100.00%   79.8kB/79.8kB (546.9kB/s) Job-103108861380691698548384651.csv Done...
samps<-intersect(dtab$specimenID,mtab$specimenID)

#get RF-selected latent variables
lvs<-synTableQuery("select * from syn21315356")$asDataFrame()$LatentVar
## 
 [####################]100.00%   1/1   Done...    
Downloading  [####################]100.00%   2.2kB/2.2kB (2.8MB/s) Job-103108872313621925498032413.csv Done...
mp_res<-dtab%>%
  subset(specimenID%in%samps)%>%
  subset(latent_var%in%lvs)%>%
#  group_by(latent_var) %>%
#  mutate(sd_value = sd(value)) %>%
#  filter(sd_value > 0.05) %>%
#  ungroup()%>%
  select(latent_var,value,specimenID,diagnosis)%>%
  left_join(rftab,by='latent_var')

combined<-mp_res%>%ungroup()%>%inner_join(mtab,by='specimenID')

#now compute some basic stats
mp_stats<-mp_res%>%
  rowwise()%>%mutate(MaxVal=max(`Cutaneous Neurofibroma`,`Plexiform Neurofibroma`,`Malignant Peripheral Nerve Sheath Tumor`,Neurofibroma))%>%
  rowwise()%>%
  mutate(MeanVal=mean(c(`Cutaneous Neurofibroma`,`Plexiform Neurofibroma`,`Malignant Peripheral Nerve Sheath Tumor`,Neurofibroma)))

DT::datatable(mp_stats)

Plotting protein correlations

With the RF-selected LVs for each random forest prediction, we can plot those metaviper proteins that correlate with them.

corVals=combined%>%#subset(latent_var%in%unique(unlist(top10)))%>%
    group_by(latent_var,gene)%>%
  summarize(corVal=cor(value,metaviperscore,use='pairwise.complete.obs'),numSamps=n_distinct(specimenID))

corVals
## # A tibble: 627,476 x 4
## # Groups:   latent_var [103]
##    latent_var               gene    corVal numSamps
##    <chr>                    <chr>    <dbl>    <int>
##  1 1,REACTOME_MRNA_SPLICING AATF    0.436        77
##  2 1,REACTOME_MRNA_SPLICING ABCA1  -0.570        77
##  3 1,REACTOME_MRNA_SPLICING ABCC8  -0.327        77
##  4 1,REACTOME_MRNA_SPLICING ABCC9  -0.619        77
##  5 1,REACTOME_MRNA_SPLICING ABCG1  -0.521        77
##  6 1,REACTOME_MRNA_SPLICING ABCG4   0.239        77
##  7 1,REACTOME_MRNA_SPLICING ABI1   -0.356        77
##  8 1,REACTOME_MRNA_SPLICING ABL1   -0.0887       77
##  9 1,REACTOME_MRNA_SPLICING ABL2   -0.318        77
## 10 1,REACTOME_MRNA_SPLICING ABLIM3 -0.652        77
## # … with 627,466 more rows
##let's store this in Synapse
tab<-synBuildTable('Top10 Metaviper Latent-Variable Correlations',parent='syn21046734',corVals)
synStore(tab)
## 
Uploading [--------------------]0.00%   0.0bytes/27.6MB  filef9c923e9f5fb     
Uploading [######--------------]28.96%   8.0MB/27.6MB (760.2kB/s) filef9c923e9f5fb     
Uploading [############--------]57.92%   16.0MB/27.6MB (761.6kB/s) filef9c923e9f5fb     
Uploading [#################---]86.89%   24.0MB/27.6MB (765.1kB/s) filef9c923e9f5fb     
Uploading [####################]100.00%   27.6MB/27.6MB (684.6kB/s) filef9c923e9f5fb Done...    
 [####################]100.00%   1/1   Done...
## <synapseclient.table.CsvFileTable object at 0x11842d588>
#corVals<-corVals%>%subset(latent_var%in%unique(unlist(top10)))
##now how do we bracket them?
##plot correlation distributions by cell type and method.
require(ggplot2)

##first re-order variables to plot
top.df<-mp_stats%>%rename(All='MeanVal')%>%gather(key="disease",value="pathway")

p<-corVals%>%
              ungroup()%>%
  subset(latent_var%in%unique(unlist(lvs)))%>%
          #    mutate(LatentVariable = stringr::str_trim(as.character(latent_var), 20))%>%
              ggplot()+geom_boxplot(aes(x=latent_var,y=corVal))+ theme(axis.text.x = element_text(angle = 45, hjust = 1))+ggtitle("Correlation of metaviper proteins with lv")
print(p)

There are some proteins that show up as highly correlated. By choosing a threshold, we can evaluate what they are in more detail.

These plots represent the top 10 latent variables for a predictor of each tumor type and the proteins that are correlated with them.

corthresh=0.75

##now filter to the cell types with correlated proteins
cor_cell_types=subset(corVals,corVal>corthresh)%>%
  subset(latent_var%in%unique(unlist(lvs)))%>%
      ungroup()%>%
  select(latent_var)%>%
  distinct()

print(paste('we found',nrow(cor_cell_types),'lvs with some protein correlation greater than',corthresh))
## [1] "we found 64 lvs with some protein correlation greater than 0.75"
DT::datatable(cor_cell_types)
apply(cor_cell_types,1,function(x){
  ct=x[['latent_var']]
#  m=x[['method']]

  #for each gene and cell type
  genes=subset(corVals,latent_var==ct)%>%
        subset(corVal>corthresh)%>%
   arrange(desc(corVal))%>%
      ungroup()

    if(nrow(genes)>12){
    new.corthresh=format(genes$corVal[12],digits=3)
    genes=genes[1:12,]
  }else{
    new.corthresh=corthresh
  }

  scores=subset(combined,gene%in%genes$gene)%>%subset(latent_var==ct)

  p2<- ggplot(scores)+
      geom_point(aes(x=value,y=metaviperscore,
          col=gene,shape=tumorType))+
  #  scale_x_log10()+
      ggtitle(paste(ct,'correlation >',new.corthresh,'\n',
        subset(top.df,pathway==ct)%>%select(disease)%>%unlist()%>%paste(collapse=',')))
  print(p2)
 # ggsave(paste0(m,'predictions of',gsub(" ","",gsub("/","",ct)),'cor',new.corthresh,'.pdf'))
})

## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

## 
## [[25]]

## 
## [[26]]

## 
## [[27]]

## 
## [[28]]

## 
## [[29]]

## 
## [[30]]

## 
## [[31]]

## 
## [[32]]

## 
## [[33]]

## 
## [[34]]

## 
## [[35]]

## 
## [[36]]

## 
## [[37]]

## 
## [[38]]

## 
## [[39]]

## 
## [[40]]

## 
## [[41]]

## 
## [[42]]

## 
## [[43]]

## 
## [[44]]

## 
## [[45]]

## 
## [[46]]

## 
## [[47]]

## 
## [[48]]

## 
## [[49]]

## 
## [[50]]

## 
## [[51]]

## 
## [[52]]

## 
## [[53]]

## 
## [[54]]

## 
## [[55]]

## 
## [[56]]

## 
## [[57]]

## 
## [[58]]

## 
## [[59]]

## 
## [[60]]

## 
## [[61]]

## 
## [[62]]

## 
## [[63]]

## 
## [[64]]

#parentid='syn20710537'
#for(fi in list.files('.')[grep('tions',list.files('.'))])
#  synapser::synStore(synapser::File(fi,parentId=parentid,annotations=list(resourceType='analysis',isMultiSpecimen='TRUE',isMultiIndividual='TRUE')),used=c(deconv_scores,metaviper_scores),executed=this.script)